In [1]:
import pandas as pd
In [2]:
violations_df = pd.read_csv("violations.csv")
#violations_df = pd.read_csv("violations.csv", nrows=100)
In [3]:
# 1. I want to make sure my Plate ID is a string. Can't lose the leading zeroes!
#3. I want the dates to be dates! Read the read_csv documentation to find out how to make pandas automatically parse dates.
col_types = { 'Plate ID': 'str'}
dof_violations_df = pd.read_csv("DOF_Parking_Violation_Codes.csv", dtype=col_types, parse_dates=True)
#dof_violations_df = pd.read_csv("DOF_Parking_Violation_Codes.csv", dtype=col_types, parse_dates=True, nrows=100)
In [4]:
# print(violations_df.columns)
In [5]:
# len(violations_df)
In [6]:
# print(dof_violations_df.head())
In [7]:
violations_df = violations_df[violations_df['Vehicle Year'] != 0]
# print(violations_df['Vehicle Year'].head())
In [8]:
# print(violations_df['Date First Observed'].head())
In [9]:
import datetime
violations_df.head()['Issue Date'].astype(datetime.datetime)
Out[9]:
In [10]:
import datetime
violations_df.head()['Issue Date'].astype(datetime.datetime)
#violations_df['Issue Date'] = violations_df['Issue Date'].astype('datetime64[ns]')
Out[10]:
In [11]:
# violations_df.dtypes
In [12]:
import re
# violations_df['Violation Time'].head()
In [13]:
import numpy as np
# Build a function using that method
def time_to_datetime(str_time):
try:
#str_time = re.sub('^0', '', str_time)
if isinstance(str_time, str):
str_time = str_time + "M"
#print("Trying to convert", str_time, "into a time")
return datetime.datetime.strptime(str_time.strip(), "%I%M%p").time()
#try:
# if str_time == -999:
# print("It's -999")
# return np.nan
except:
return np.nan
In [14]:
# Apply that method to the 'Time' column of the dataframe
violations_df['Violation Time'] = violations_df['Violation Time'].apply(time_to_datetime)
In [15]:
print(violations_df['Violation Time'].head())
In [16]:
# def remove_1900(year):
# year = str(year)
# year = re.sub('^1900-01-01', '', year)
# return year
# print(violations_df['Violation Time'].apply(remove_1900))
In [17]:
import re
In [18]:
def vehicle_colors(vehicle):
if isinstance(vehicle, str):
#print(vehicle)
vehicle = re.sub('^GY$', 'GREY', vehicle)
#vehicle = vehicle.replace("GY", "GREY")
vehicle = re.sub('^WH$', 'WHITE', vehicle)
vehicle = re.sub('^BR$', 'BROWN', vehicle)
vehicle = re.sub('^RD$', 'RED', vehicle)
vehicle = re.sub('^B[LK]$', 'BLACK', vehicle)
vehicle = re.sub('^TN$', 'TAN', vehicle)
vehicle = re.sub('^YW$', 'YELLOW', vehicle)
vehicle = re.sub('^SIL$', 'SILVER', vehicle)
vehicle = re.sub('^GR$', 'GREEN', vehicle)
vehicle = re.sub('^SILVE$', 'SILVER', vehicle)
# vehicle = vehicle.replace("WH","WHITE")
# vehicle = vehicle.replace("BR","BROWN")
# vehicle = vehicle.replace("RD","RED")
# vehicle = vehicle.replace("BL","BLACK")
# vehicle = vehicle.replace("BK","BLACK")
# vehicle = vehicle.replace("TN","TAN")
# vehicle = vehicle.replace("YW","YELLOW")
# vehicle = re.sub('SIL$', 'SILVER', vehicle)
# vehicle = vehicle.replace("SILVR","SILVER")
# vehicle = vehicle.replace("SIL","SILVER")
return vehicle
violations_df['Vehicle Color'] = violations_df['Vehicle Color'].apply(vehicle_colors)
In [19]:
violations_df['Vehicle Color'].head()
Out[19]:
In [20]:
print(violations_df.columns)
violations_df['Violation Code'].head()
Out[20]:
In [21]:
violations_df
Out[21]:
In [22]:
print(dof_violations_df.columns)
dof_violations_df['CODE'].head()
Out[22]:
In [23]:
# violations_df.merge(dof_violations_df, left_on='Violation Code', right_on='CODE')
In [24]:
#print(dof_violations_df['CODE'][0])
#test = dof_violations_df['CODE'][0]
def to_int(test):
try:
# test = re.sub('$', '', test)
test = test.strip("$")
#print(test)
test = int(test)
#print(test, " is now an int")
return test
except:
print(test, " coult not be converted to an int")
return np.nan
In [25]:
dof_violations_df['CODE'] = dof_violations_df['CODE'].apply(to_int)
In [26]:
violations_df = violations_df.merge(dof_violations_df, left_on='Violation Code', right_on='CODE')
In [27]:
violations_df.describe()
Out[27]:
In [28]:
violations_df.columns
Out[28]:
In [29]:
violations_df['Summons Number']
Out[29]:
In [30]:
violations_df['Manhattan\xa0 96th St. & below']
Out[30]:
In [31]:
violations_df['Manhattan\xa0 96th St. & below'] = violations_df['Manhattan\xa0 96th St. & below'].apply(to_int)
In [32]:
violations_df['Manhattan\xa0 96th St. & below'].sum()
Out[32]:
In [33]:
violations_df['All Other Areas'] = violations_df['All Other Areas'].apply(to_int)
In [34]:
violations_df['All Other Areas'].sum()
Out[34]:
In [35]:
violations_df['Manhattan\xa0 96th St. & below'].describe()
Out[35]:
In [36]:
violations_df.groupby('CODE')['All Other Areas'].mean().sort_values(ascending=False).head(8)
Out[36]:
The most frequent parking violation is:
In [37]:
import matplotlib.pyplot as plt
%matplotlib inline
freqNYviolations = violations_df.groupby('CODE')['All Other Areas'].count().sort_values(ascending=False).head().plot(kind="bar", color = ['#624ea7', '#599ad3', '#f9a65a', '#9e66ab', 'purple'])
freqNYviolations.set_title('Most Frequent NYC Parking Violations by Code')
freqNYviolations.set_xlabel('Parking Violation Code')
freqNYviolations.set_ylabel('Frequency of Citations')
plt.savefig('freqParkingViolations.png')
In [38]:
violations_df.groupby('CODE')['All Other Areas'].count().sort_values(ascending=False).head(1)
Out[38]:
In [39]:
# violations_df.sort_values('All Other Areas', ascending=False)
In [40]:
nj_violations_df = violations_df[violations_df['Registration State'] != 'NJ']
print("There were", len(nj_violations_df['All Other Areas']), "parking violations in NYC by drivers registered in New Jersey")
print("These violations totaled", nj_violations_df['All Other Areas'].sum(), "of revenue")
How much money does NYC make off of all non-New York vehicles?
In [41]:
violations_df.groupby('Registration State')['All Other Areas'].sum().sort_values(ascending=False).head(8)
Out[41]:
In [42]:
import matplotlib.pyplot as plt
%matplotlib inline
nonny_violations_df = violations_df[violations_df['Registration State'] != 'NY']
nonny_violations_df = nonny_violations_df[violations_df['Registration State'] != '99']
nonNYviolations = nonny_violations_df.groupby('Registration State')['All Other Areas'].sum().sort_values(ascending=False).head(20).plot(kind="bar", color = ['#624ea7', '#599ad3', '#f9a65a', '#9e66ab', 'purple'])
nonNYviolations.set_title('Money Earned from NYC Parking Violations by State')
nonNYviolations.set_xlabel('Vehicle Registration State (NON-NYS)')
nonNYviolations.set_ylabel('Amount Earned')
plt.savefig('nonNYParkingViolations.png')
In [43]:
import matplotlib.pyplot as plt
%matplotlib inline
nonny_violations_df = violations_df[violations_df['Registration State'] != 'NY']
nonny_violations_df = nonny_violations_df[violations_df['Registration State'] != '99']
freqnonNYviolations = nonny_violations_df.groupby('Registration State')['All Other Areas'].count().sort_values(ascending=False).head(20).plot(kind="bar", color = ['#624ea7', '#599ad3', '#f9a65a', '#9e66ab', 'purple'])
freqnonNYviolations.set_title('Frequency of NYC Parking Violations by State')
freqnonNYviolations.set_xlabel('Vehicle Registration State (NON-NYS)')
freqnonNYviolations.set_ylabel('Number of Citations')
plt.savefig('freqnonNYParkingViolations.png')
In [48]:
print("The average ticket cost in NYC is $", violations_df['All Other Areas'].mean())
In [51]:
violations_df.groupby('Issue Date')['All Other Areas'].count().sort_values(ascending=False)
Out[51]:
In [ ]:
freqnonNYviolations = violations_df.groupby('Issue Date')['All Other Areas'].count().plot(kind="bar", color = ['#624ea7', '#599ad3', '#f9a65a', '#9e66ab', 'purple'])
freqnonNYviolations.set_title('Frequency of NYC Parking Violations by Date')
freqnonNYviolations.set_xlabel('Date')
freqnonNYviolations.set_ylabel('Number of Citations')
plt.savefig('datedfreqnonNYParkingViolations.png')
In [55]:
freqnonNYviolations = violations_df.groupby('Issue Date')['All Other Areas'].sum().plot(kind="bar", color = ['#624ea7', '#599ad3', '#f9a65a', '#9e66ab', 'purple'])
freqnonNYviolations.set_title('Revenue of NYC Parking Violations by Date')
freqnonNYviolations.set_xlabel('Date')
freqnonNYviolations.set_ylabel('Revenue Generated from Citations')
plt.savefig('datedrevenueonNYParkingViolations.png')
In [ ]:
16. Manually construct a dataframe out of https://dmv.ny.gov/statistic/2015licinforce-web.pdf (only NYC boroughts - bronx, queens, manhattan, staten island, brooklyn), having columns for borough name, abbreviation, and number of licensed drivers.
17. What's the parking-ticket-$-per-licensed-driver in each borough of NYC? Do this with pandas and the dataframe you just made, not with your head!